library(tidyverse)
library(tidytuesdayR)
library(here)
library(PNWColors)
library(plotly)
library(ggmap)
library(sp)
library(maps)
library(maptools)
# NOTE(review): this clears every object in the global environment of the
# session that runs the script; kept so the script's side effects are unchanged.
rm(list = ls())

### Google API
# read_table() treats the first line of API.txt as a header, so names()
# returns that line's content -- presumably the file holds only the key.
API <- names(read_table("API.txt"))
register_google(key = API) # uses my API key stored in a separate txt file

# Load the airmen data set (TidyTuesday, 2022-02-08).
# This assignment must sit on its own line: when it trails a comment it is
# never evaluated and `airmen` does not exist for the rest of the script.
airmen <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-02-08/airmen.csv")
## Rows: 1006 Columns: 16
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (13): name, last_name, first_name, rank_at_graduation, class, graduated...
## dbl (1): number_of_aerial_victory_credits
## dttm (2): graduation_date, reported_lost_date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(airmen)
## # A tibble: 6 x 16
## name last_name first_name graduation_date rank_at_graduat~ class
## <chr> <chr> <chr> <dttm> <chr> <chr>
## 1 Adams, John H~ Adams John H., ~ 1945-04-15 00:00:00 2nd Lt SE-4~
## 2 Adams, Paul Adams Paul 1943-04-29 00:00:00 2nd Lt SE-4~
## 3 Adkins, Ruthe~ Adkins Rutherfor~ 1944-10-16 00:00:00 2nd Lt SE-4~
## 4 Adkins, Winst~ Adkins Winston A. 1944-02-08 00:00:00 2nd Lt TE-4~
## 5 Alexander, Ha~ Alexander Halbert L. 1944-11-20 00:00:00 2nd Lt SE-4~
## 6 Alexander, Ha~ Alexander Harvey R. 1944-04-15 00:00:00 2nd Lt TE-4~
## # ... with 10 more variables: graduated_from <chr>, pilot_type <chr>,
## # military_hometown_of_record <chr>, state <chr>,
## # aerial_victory_credits <chr>, number_of_aerial_victory_credits <dbl>,
## # reported_lost <chr>, reported_lost_date <dttm>,
## # reported_lost_location <chr>, web_profile <chr>
# Character vector of distinct "hometown, state" strings used as geocode input
stateID <- airmen %>%
  unite(
    col = "state",
    military_hometown_of_record, state,
    sep = ", ",
    remove = TRUE
  ) %>%
  distinct(state) %>%
  drop_na() %>%
  as_vector()
# geocode() looked up lon/lat for each entry of stateID; the result was written
# to csv once so that geocode does not have to be run again:
#stateIDloc <- geocode(location = stateID, output = "latlon", source = "google")
#write_csv(stateIDloc, here("2022_Week6","Data","state_latlon.csv"))
# Load the saved lon/lat table for the hometowns
stateIDloc <- here("2022_Week6", "Data", "state_latlon.csv") %>%
  read_csv()
## Rows: 390 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): lon, lat
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# bind geocode to stateID
# NOTE(review): cbind() pairs the location strings with the lon/lat rows purely
# by position -- this presumably relies on state_latlon.csv having been written
# from this same stateID vector, in the same order; confirm if the data changes.
stateID <- stateID %>%
cbind(stateIDloc) %>%
# the piped-in vector arrives as a column called '.', renamed here to `state`
rename(state = '.')
# Collapse hometown and state into one "hometown, state" key so that it lines
# up with the `state` column of stateID
airmen <- airmen %>%
  unite(
    col = state, military_hometown_of_record, state,
    sep = ", ", remove = TRUE
  )

# Attach the per-location airman count (n) and the lon/lat to every airman.
# Both joins name their key explicitly so they cannot silently pick up an
# additional shared column if either table changes.
air_full <- airmen %>%
  count(state) %>%
  as_tibble() %>%
  right_join(airmen, by = "state") %>% # rejoin after count()
  left_join(stateID, by = "state") %>%
  drop_na(state, rank_at_graduation, name) %>% # remove NAs from columns of interest
  filter(
    rank_at_graduation != "N/A",
    state != "Unk"
  ) %>%
  mutate(rank_at_graduation = str_replace_all(
    string = rank_at_graduation,
    # `$` anchors the pattern at the end of the string, so a value ending in
    # "Capt" is expanded while "Captain" is left untouched
    pattern = "Capt$",
    replacement = "Captain" # replace values for consistency
  ))
head(air_full)
## # A tibble: 6 x 18
## state n name last_name first_name graduation_date rank_at_graduat~
## <chr> <int> <chr> <chr> <chr> <dttm> <chr>
## 1 Ahoskie~ 3 Reyn~ Reynolds Clarence ~ 1945-08-04 00:00:00 Flight Officer
## 2 Ahoskie~ 3 Smit~ Smith Graham 1942-07-03 00:00:00 2nd Lt
## 3 Ahoskie~ 3 Smit~ Smith Reginald ~ 1945-08-04 00:00:00 Flight Officer
## 4 Akron, ~ 1 McCl~ McClenic William B~ 1943-08-30 00:00:00 2nd Lt
## 5 Albany,~ 3 Blay~ Blaylock Joseph E. 1945-06-27 00:00:00 2nd Lt
## 6 Albany,~ 3 Hall~ Hall Richard W. 1943-07-28 00:00:00 2nd Lt
## # ... with 11 more variables: class <chr>, graduated_from <chr>,
## # pilot_type <chr>, aerial_victory_credits <chr>,
## # number_of_aerial_victory_credits <dbl>, reported_lost <chr>,
## # reported_lost_date <dttm>, reported_lost_location <chr>, web_profile <chr>,
## # lon <dbl>, lat <dbl>
# watercolor basemap
USmap <- get_map(location = "US", zoom = 4, maptype = "watercolor")

# Remove unnecessary columns and reduce the graduation date to its year
# (for possible additional processing)
air_full <- air_full %>%
  select(-c(
    pilot_type, aerial_victory_credits, number_of_aerial_victory_credits,
    reported_lost, reported_lost_date, reported_lost_location, web_profile
  )) %>%
  # split the date on "-" and keep only the leading (year) piece
  separate(
    graduation_date,
    into = c("graduation_year", "month", "day"),
    sep = "-",
    remove = TRUE
  ) %>%
  select(-c(month, day)) %>%
  # put the graduation year in parentheses
  mutate(graduation_year = paste0("(", graduation_year, ")")) %>%
  # add the graduation year to the name column
  unite(col = name, name, graduation_year, sep = " ", remove = TRUE)
# total distinct states in dataframe
nstates <- as.numeric(nrow(stateID))

# Build, for every location in stateID (first column), one row per graduation
# rank whose `text` holds all airmen names of that rank joined by "__".
# Each location is processed into its own tibble and the pieces are bound once
# at the end, instead of growing the result with rbind() on every iteration;
# iterating over the column itself also behaves correctly when it is empty.
gradText <- stateID[[1]] %>%
  as.character() %>%
  map(function(state_name) {
    air_full %>%
      filter(state == state_name) %>%
      group_by(rank_at_graduation) %>%
      # combine the names of a rank into one character string
      mutate(text = str_flatten(string = as.vector(name), collapse = "__")) %>%
      ungroup() %>%
      select(state, text, rank_at_graduation) %>%
      distinct()
  })

# The empty, typed tibble guarantees the three columns exist even when no
# location produced any rows.
gradText <- bind_rows(
  tibble(
    state = character(),
    text = character(),
    rank_at_graduation = character()
  ),
  gradText
) %>%
  drop_na()
# Create one text label per location: the location name followed by each
# graduation rank and, below it, the names of that rank on separate lines.
# The join key is named explicitly so the lon/lat lookup cannot silently match
# on any other shared column.
air_text <- gradText %>%
  # replace double underscore with new line
  mutate(text = str_replace_all(
    string = text, pattern = "__", replacement = "\n "
  )) %>%
  left_join(stateID, by = "state") %>% # add lat and lon columns
  unite(
    col = rank_text, rank_at_graduation, text,
    sep = ": \n", remove = TRUE
  ) %>%
  group_by(state) %>%
  # bring all rank rows of a location together into one string
  mutate(rank_text = str_flatten(
    string = as.vector(rank_text), collapse = "\n \n"
  )) %>%
  ungroup() %>%
  unite(
    col = state_rank_text, state, rank_text,
    sep = "\n \n", remove = TRUE
  ) %>%
  distinct() # remove duplicates